The goal of this script is to generate a Seurat object for sample 2022_02.

  • removal of cells based on quality control metrics
  • normalization with LogNormalize, then doublets detection using scran hybrid and scDblFinder method, and doublet cells removal
  • normalization with LogNormalize, for only the remaining cells
  • cell cycle and cell type annotation
  • dimensionality reduction using PCA
  • projection using tSNE and UMAP
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
## [1] "/usr/local/lib/R/library"

Preparation

In this section, we set the global settings of the analysis. We will store data there :

out_dir = "."

We load the parameters :

sample_name = params$sample_name # "2021_31"
# sample_name = "2021_31"

Input count matrix is there :

count_matrix_dir = paste0(out_dir, "/input/", sample_name)

We load the markers and specific colors for each cell type :

cell_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##               13               13                9               10 
##          B cells          cuticle           cortex          medulla 
##               16               15               16               10 
##              IRS    proliferative              IBL              ORS 
##               16               20               15               16 
##              IFE             HFSC      melanocytes        sebocytes 
##               17               17               10                8

Here are custom colors for each cell type :

color_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())

We load markers to display on the dotplot :

dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
## $`CD4 T cells`
## [1] "CD3E" "CD4" 
## 
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
## 
## $`Langerhans cells`
## [1] "CD207" "CPVL" 
## 
## $macrophages
## [1] "TREM2" "MSR1" 
## 
## $`B cells`
## [1] "CD79A" "CD79B"
## 
## $cuticle
## [1] "KRT32" "KRT35"
## 
## $cortex
## [1] "KRT31" "PRR9" 
## 
## $medulla
## [1] "BAMBI"   "ADLH1A3"
## 
## $IRS
## [1] "KRT71" "KRT73"
## 
## $proliferative
## [1] "TOP2A" "MCM5" 
## 
## $IBL
## [1] "KRT16" "KRT6C"
## 
## $ORS
## [1] "KRT15" "GPX2" 
## 
## $IFE
## [1] "SPINK5" "LY6D"  
## 
## $HFSC
## [1] "DIO2"   "TCEAL2"
## 
## $melanocytes
## [1] "DCT"   "MLANA"
## 
## $sebocytes
## [1] "CLMP"  "PPARG"

We load metadata for this sample :

sample_info = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
##   project_name sample_type sample_identifier gender   color
## 1      2022_02          HD              HD_2      F #3A5FCD

These is a parameter for different functions :

cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50

Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
## [1]   27955 6794880
## [1] 49660216
## [1] 27955  3612
## [1] 44875419
## [1] 0.9036493
sobj
## An object of class Seurat 
## 27955 features across 3612 samples within 1 assay 
## Active assay: RNA (27955 features, 0 variable features)

(Time to run : 123.93 s)

In genes metadata, we add the Ensembl ID. The sobj@assays$RNA@meta.features dataframe contains three information :

  • rownames : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a .1 at the end of their name
  • Ensembl_ID : EnsemblID, as stored in the features.tsv.gz file
  • gene_name : gene_name, as stored in the features.tsv.gz file. Duplicated gene names will have the same name.
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
##                  Ensembl_ID   gene_name
## MIR1302-2HG ENSG00000243485 MIR1302-2HG
## FAM138A     ENSG00000237613     FAM138A
## OR4F5       ENSG00000186092       OR4F5
## AL627309.1  ENSG00000238009  AL627309.1
## AL627309.3  ENSG00000239945  AL627309.3
## AL627309.4  ENSG00000241599  AL627309.4

We add the same columns as in metadata :

row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
## [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
## [4] "log_nCount_RNA"    "project_name"      "sample_identifier"
## [7] "sample_type"

Before filtering

Normalization

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 3612 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)

Projection

We generate a tSNE to visualize cells before filtering.

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
## An object of class Seurat 
## 27955 features across 3612 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Cell type

We annotate cells for cell type using Seurat::AddModuleScore function.

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##              141               75               55               50 
##          B cells          cuticle           cortex          medulla 
##               40              475               78              233 
##              IRS    proliferative              IBL              ORS 
##              149              699              222              383 
##              IFE             HFSC      melanocytes        sebocytes 
##               81              291              609               31

(Time to run : 26.44 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle phase

We annotate cells for cell cycle phase using Seurat and cyclone.

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 1524  620 1467
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  1007  291  928
##   G2M  202  270  101
##   S    315   59  438

(Time to run : 327.6 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA log_nCount_RNA
## AAACCCAAGGGAGGCA-1    2022_02        132          113       4.882802
## AAACCCACATCCTATT-1    2022_02       4408         1079       8.391176
## AAACCCAGTCGTCTCT-1    2022_02        145           98       4.976734
## AAACCCAGTGTAGGAC-1    2022_02      36306         5654      10.499738
## AAACGAACAACCAATC-1    2022_02       4347         1924       8.377241
## AAACGAACAAGTATCC-1    2022_02      17586         4102       9.774858
##                    project_name sample_identifier sample_type score_CD4_T_cells
## AAACCCAAGGGAGGCA-1      2022_02              HD_2          HD       -0.01751676
## AAACCCACATCCTATT-1      2022_02              HD_2          HD       -0.07039404
## AAACCCAGTCGTCTCT-1      2022_02              HD_2          HD       -0.01371429
## AAACCCAGTGTAGGAC-1      2022_02              HD_2          HD       -0.12999860
## AAACGAACAACCAATC-1      2022_02              HD_2          HD        0.04670195
## AAACGAACAAGTATCC-1      2022_02              HD_2          HD       -0.08027089
##                    score_CD8_T_cells score_Langerhans_cells score_macrophages
## AAACCCAAGGGAGGCA-1      -0.013660589            -0.02761229      -0.004617715
## AAACCCACATCCTATT-1      -0.052365793            -0.03726479      -0.019321763
## AAACCCAGTCGTCTCT-1      -0.006684504            -0.02789566       0.000000000
## AAACCCAGTGTAGGAC-1      -0.094085500            -0.07992263      -0.034099624
## AAACGAACAACCAATC-1      -0.120047718             1.30086554       0.280247068
## AAACGAACAAGTATCC-1      -0.084180891            -0.07394105       0.007045939
##                    score_B_cells score_cuticle score_cortex score_medulla
## AAACCCAAGGGAGGCA-1   0.000000000   -0.10869883  -0.02437153   -0.07331642
## AAACCCACATCCTATT-1  -0.004136588   -0.20339989   0.02566296   -0.20152377
## AAACCCAGTCGTCTCT-1   0.000000000   -0.05856797  -0.02923608   -0.04171171
## AAACCCAGTGTAGGAC-1  -0.006615494    0.55064212   0.03549632   -0.14114289
## AAACGAACAACCAATC-1  -0.016571413   -0.28599820  -0.15454661   -0.35797885
## AAACGAACAAGTATCC-1  -0.006659577   -0.33184117  -0.09560058   -0.27248035
##                       score_IRS score_proliferative   score_IBL   score_ORS
## AAACCCAAGGGAGGCA-1 -0.026075167         -0.03545802 -0.04777447  0.19477642
## AAACCCACATCCTATT-1 -0.085342508         -0.14121175  0.10708811  0.06384176
## AAACCCAGTCGTCTCT-1 -0.008720768         -0.02602588 -0.02111632 -0.05479933
## AAACCCAGTGTAGGAC-1 -0.061051817          0.68749458  0.01441245 -0.46298881
## AAACGAACAACCAATC-1 -0.179925686         -0.25546473 -0.24296155 -0.34843705
## AAACGAACAAGTATCC-1 -0.057478584         -0.19326806  0.28763750  0.16860190
##                      score_IFE  score_HFSC score_melanocytes score_sebocytes
## AAACCCAAGGGAGGCA-1 -0.01910979  0.17459906        0.38302816     -0.01640045
## AAACCCACATCCTATT-1  0.12074124 -0.19318666       -0.26786430     -0.06530349
## AAACCCAGTCGTCTCT-1 -0.01335850 -0.05727866       -0.07317189     -0.02675065
## AAACCCAGTGTAGGAC-1 -0.07085904 -0.33237672       -0.66517358     -0.05444636
## AAACGAACAACCAATC-1 -0.03453352 -0.22556268       -0.44869136      0.03282660
## AAACGAACAAGTATCC-1 -0.02356890  0.73492986       -0.33602596     -0.01420379
##                           cell_type Seurat.Phase cyclone.Phase percent.mt
## AAACCCAAGGGAGGCA-1      melanocytes            S            G1  2.2727273
## AAACCCACATCCTATT-1              IFE           G1             S 58.5299456
## AAACCCAGTCGTCTCT-1      macrophages           G1           G2M 34.4827586
## AAACCCAGTGTAGGAC-1    proliferative          G2M           G2M  5.2140142
## AAACGAACAACCAATC-1 Langerhans cells           G1             S  0.4600874
## AAACGAACAAGTATCC-1             HFSC            S             S  3.4004322
##                    percent.rb
## AAACCCAAGGGAGGCA-1  18.181818
## AAACCCACATCCTATT-1   1.202359
## AAACCCAGTCGTCTCT-1   2.068966
## AAACCCAGTGTAGGAC-1  24.285242
## AAACGAACAACCAATC-1   8.488613
## AAACGAACAAGTATCC-1  21.903787

We get the cell barcodes for the failing cells :

fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()

Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
## An object of class Seurat 
## 27955 features across 2858 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 3612 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

We identify doublet cells :

fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)
## [1] 27955  2858
## 
## FALSE  TRUE 
##  2654   204 
## [17:24:45] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## 
## FALSE  TRUE 
##  2582   276 
## 
## FALSE  TRUE 
##  2463   395
fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)

(Time to run : 92.99 s)

We add the information in the non filtered Seurat object :

sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)

Quality control representation

We can visualize the 4 cells quality with a Venn diagram :

n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))

Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)

Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Number of features

To visualize the threshold for number of features, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)

Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)

Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)

Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the log_nCount_RNA by nFeature_RNA figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (doublets_consensus.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (scDblFinder.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (hybrid_score.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

Visualization as piechart

Do filtered cells belong to a particular cell type ?

sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")

Doublet cells status

We can compare doublet detection methods with a Venn diagram :

ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))

We visualize cells annotation for doublets :

plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)

What is the composition of doublet cells ? We just look at score for each cell type.

sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
show

Save

We could save this object before filtering (remove eval = FALSE) :

saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))

Filtering

We remove :

  • cells with a number of UMI lower than 6
  • cells expressing a number of genes lower than 500
  • cells having more than 20 % of UMI related to mitochondrial genes
  • cells having more than 50 % of UMI related to ribosomal genes
  • doublet cells detected with both method (scDblFinder and scds-hybrid)
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
## An object of class Seurat 
## 27955 features across 2286 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Post-filtering processing

Normalization

We normalize the count matrix for remaining cells :

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 2286 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Projection

We perform a PCA :

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE and a UMAP with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))

Annotation

We annotate cells for cell type, with the new normalized expression matrix :

score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
## 
##      CD4 T cells      CD8 T cells Langerhans cells      macrophages 
##              119               65               32               29 
##          B cells          cuticle           cortex          medulla 
##               21              305               59              120 
##              IRS    proliferative              IBL              ORS 
##              103              466              134              272 
##              IFE             HFSC      melanocytes        sebocytes 
##               48              214              276               23

(Time to run : 18.41 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle

We annotate cells for cell cycle phase :

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 1083  258  945
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##        G1 G2M   S
##   G1  764 112 632
##   G2M 107 130  41
##   S   212  16 272

(Time to run : 240.75 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Clustering

We make a highly resolutive clustering :

sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2286
## Number of edges: 67216
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8252
## Number of communities: 22
## Elapsed time: 0 seconds
table(sobj$seurat_clusters)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 193 170 167 158 149 144 142 138 129 121 111 102  98  92  89  53  49  48  44  31 
##  20  21 
##  30  28

Visualization

Cell type

We can visualize the cell type :

tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Cell cycle

We can visualize the cell cycle, from Seurat :

tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

We can visualize the cell cycle, from cyclone :

tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Clusters

We visualize the clustering :

tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Gene expression

We visualize all cell types markers on the tSNE :

markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)

Save

We save the annotated and filtered Seurat object :

saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))

R session

show
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5   patchwork_1.1.2 dplyr_1.0.7    
## 
## loaded via a namespace (and not attached):
##   [1] softImpute_1.4              graphlayouts_0.7.0         
##   [3] pbapply_1.4-2               lattice_0.20-41            
##   [5] haven_2.3.1                 vctrs_0.3.8                
##   [7] usethis_2.0.1               dynwrap_1.2.1              
##   [9] blob_1.2.1                  survival_3.2-13            
##  [11] prodlim_2019.11.13          dynutils_1.0.5             
##  [13] later_1.3.0                 DBI_1.1.1                  
##  [15] R.utils_2.11.0              SingleCellExperiment_1.8.0 
##  [17] rappdirs_0.3.3              uwot_0.1.8                 
##  [19] dqrng_0.2.1                 jpeg_0.1-8.1               
##  [21] zlibbioc_1.32.0             pspline_1.0-18             
##  [23] pcaMethods_1.78.0           mvtnorm_1.1-1              
##  [25] htmlwidgets_1.5.4           GlobalOptions_0.1.2        
##  [27] future_1.22.1               UpSetR_1.4.0               
##  [29] laeken_0.5.2                leiden_0.3.3               
##  [31] clustree_0.4.3              parallel_3.6.3             
##  [33] scater_1.14.6               irlba_2.3.3                
##  [35] DEoptimR_1.0-9              tidygraph_1.1.2            
##  [37] Rcpp_1.0.9                  readr_2.0.2                
##  [39] KernSmooth_2.23-17          carrier_0.1.0              
##  [41] promises_1.1.0              gdata_2.18.0               
##  [43] DelayedArray_0.12.3         limma_3.42.2               
##  [45] graph_1.64.0                RcppParallel_5.1.4         
##  [47] Hmisc_4.4-0                 fs_1.5.2                   
##  [49] RSpectra_0.16-0             fastmatch_1.1-0            
##  [51] ranger_0.12.1               digest_0.6.25              
##  [53] png_0.1-7                   sctransform_0.2.1          
##  [55] cowplot_1.0.0               DOSE_3.12.0                
##  [57] ggvenn_0.1.9                here_1.0.1                 
##  [59] TInGa_0.0.0.9000            ggraph_2.0.3               
##  [61] pkgconfig_2.0.3             GO.db_3.10.0               
##  [63] DelayedMatrixStats_1.8.0    gower_0.2.1                
##  [65] ggbeeswarm_0.6.0            iterators_1.0.12           
##  [67] DropletUtils_1.6.1          reticulate_1.26            
##  [69] clusterProfiler_3.14.3      SummarizedExperiment_1.16.1
##  [71] circlize_0.4.15             beeswarm_0.4.0             
##  [73] GetoptLong_1.0.5            xfun_0.35                  
##  [75] bslib_0.3.1                 zoo_1.8-10                 
##  [77] tidyselect_1.1.0            reshape2_1.4.4             
##  [79] purrr_0.3.4                 ica_1.0-2                  
##  [81] pcaPP_1.9-73                viridisLite_0.3.0          
##  [83] rtracklayer_1.46.0          rlang_1.0.2                
##  [85] hexbin_1.28.1               jquerylib_0.1.4            
##  [87] dyneval_0.9.9               glue_1.4.2                 
##  [89] RColorBrewer_1.1-2          matrixStats_0.56.0         
##  [91] stringr_1.4.0               lava_1.6.7                 
##  [93] europepmc_0.3               DESeq2_1.26.0              
##  [95] recipes_0.1.17              labeling_0.3               
##  [97] httpuv_1.5.2                class_7.3-17               
##  [99] BiocNeighbors_1.4.2         DO.db_2.9                  
## [101] annotate_1.64.0             jsonlite_1.7.2             
## [103] XVector_0.26.0              bit_4.0.4                  
## [105] mime_0.9                    aquarius_0.1.5             
## [107] Rsamtools_2.2.3             gridExtra_2.3              
## [109] gplots_3.0.3                stringi_1.4.6              
## [111] processx_3.5.2              gsl_2.1-6                  
## [113] bitops_1.0-6                cli_3.0.1                  
## [115] batchelor_1.2.4             RSQLite_2.2.0              
## [117] randomForest_4.6-14         tidyr_1.1.4                
## [119] data.table_1.14.2           rstudioapi_0.13            
## [121] org.Mm.eg.db_3.10.0         GenomicAlignments_1.22.1   
## [123] nlme_3.1-147                qvalue_2.18.0              
## [125] scran_1.14.6                locfit_1.5-9.4             
## [127] scDblFinder_1.1.8           listenv_0.8.0              
## [129] ggthemes_4.2.4              gridGraphics_0.5-0         
## [131] R.oo_1.24.0                 dbplyr_1.4.4               
## [133] BiocGenerics_0.32.0         TTR_0.24.2                 
## [135] readxl_1.3.1                lifecycle_1.0.1            
## [137] timeDate_3043.102           ggpattern_0.3.1            
## [139] munsell_0.5.0               cellranger_1.1.0           
## [141] R.methodsS3_1.8.1           proxyC_0.1.5               
## [143] visNetwork_2.0.9            caTools_1.18.0             
## [145] codetools_0.2-16            Biobase_2.46.0             
## [147] GenomeInfoDb_1.22.1         vipor_0.4.5                
## [149] lmtest_0.9-38               msigdbr_7.5.1              
## [151] htmlTable_1.13.3            triebeard_0.3.0            
## [153] lsei_1.2-0                  xtable_1.8-4               
## [155] ROCR_1.0-7                  BiocManager_1.30.10        
## [157] scatterplot3d_0.3-41        abind_1.4-5                
## [159] farver_2.0.3                parallelly_1.28.1          
## [161] RANN_2.6.1                  askpass_1.1                
## [163] GenomicRanges_1.38.0        RcppAnnoy_0.0.16           
## [165] tibble_3.1.5                ggdendro_0.1-20            
## [167] cluster_2.1.0               future.apply_1.5.0         
## [169] Seurat_3.1.5                dendextend_1.15.1          
## [171] Matrix_1.3-2                ellipsis_0.3.2             
## [173] prettyunits_1.1.1           lubridate_1.7.9            
## [175] ggridges_0.5.2              igraph_1.2.5               
## [177] RcppEigen_0.3.3.7.0         fgsea_1.12.0               
## [179] remotes_2.4.2               scBFA_1.0.0                
## [181] destiny_3.0.1               VIM_6.1.1                  
## [183] testthat_3.1.0              htmltools_0.5.2            
## [185] BiocFileCache_1.10.2        yaml_2.2.1                 
## [187] utf8_1.1.4                  plotly_4.9.2.1             
## [189] XML_3.99-0.3                ModelMetrics_1.2.2.2       
## [191] e1071_1.7-3                 foreign_0.8-76             
## [193] withr_2.5.0                 fitdistrplus_1.0-14        
## [195] BiocParallel_1.20.1         xgboost_1.4.1.1            
## [197] bit64_4.0.5                 foreach_1.5.0              
## [199] robustbase_0.93-9           Biostrings_2.54.0          
## [201] GOSemSim_2.13.1             rsvd_1.0.3                 
## [203] memoise_2.0.0               evaluate_0.18              
## [205] forcats_0.5.0               rio_0.5.16                 
## [207] geneplotter_1.64.0          tzdb_0.1.2                 
## [209] caret_6.0-86                ps_1.6.0                   
## [211] DiagrammeR_1.0.6.1          curl_4.3                   
## [213] fdrtool_1.2.15              fansi_0.4.1                
## [215] highr_0.8                   urltools_1.7.3             
## [217] xts_0.12.1                  GSEABase_1.48.0            
## [219] acepack_1.4.1               edgeR_3.28.1               
## [221] checkmate_2.0.0             scds_1.2.0                 
## [223] cachem_1.0.6                npsurv_0.4-0               
## [225] babelgene_22.3              rjson_0.2.20               
## [227] openxlsx_4.1.5              ggrepel_0.9.1              
## [229] clue_0.3-60                 rprojroot_2.0.2            
## [231] stabledist_0.7-1            tools_3.6.3                
## [233] sass_0.4.0                  nichenetr_1.1.1            
## [235] magrittr_2.0.1              RCurl_1.98-1.2             
## [237] proxy_0.4-24                car_3.0-11                 
## [239] ape_5.3                     ggplotify_0.0.5            
## [241] xml2_1.3.2                  httr_1.4.2                 
## [243] assertthat_0.2.1            rmarkdown_2.18             
## [245] boot_1.3-25                 globals_0.14.0             
## [247] R6_2.4.1                    Rhdf5lib_1.8.0             
## [249] nnet_7.3-14                 RcppHNSW_0.2.0             
## [251] progress_1.2.2              genefilter_1.68.0          
## [253] statmod_1.4.34              gtools_3.8.2               
## [255] shape_1.4.6                 HDF5Array_1.14.4           
## [257] BiocSingular_1.2.2          rhdf5_2.30.1               
## [259] splines_3.6.3               AUCell_1.8.0               
## [261] carData_3.0-4               colorspace_1.4-1           
## [263] generics_0.1.0              stats4_3.6.3               
## [265] base64enc_0.1-3             dynfeature_1.0.0           
## [267] smoother_1.1                gridtext_0.1.1             
## [269] pillar_1.6.3                tweenr_1.0.1               
## [271] sp_1.4-1                    ggplot.multistats_1.0.0    
## [273] rvcheck_0.1.8               GenomeInfoDbData_1.2.2     
## [275] plyr_1.8.6                  gtable_0.3.0               
## [277] zip_2.2.0                   knitr_1.41                 
## [279] ComplexHeatmap_2.14.0       latticeExtra_0.6-29        
## [281] biomaRt_2.42.1              IRanges_2.20.2             
## [283] fastmap_1.1.0               ADGofTest_0.3              
## [285] copula_1.0-0                doParallel_1.0.15          
## [287] AnnotationDbi_1.48.0        vcd_1.4-8                  
## [289] babelwhale_1.0.1            openssl_1.4.1              
## [291] scales_1.1.1                backports_1.2.1            
## [293] S4Vectors_0.24.4            ipred_0.9-12               
## [295] enrichplot_1.6.1            hms_1.1.1                  
## [297] ggforce_0.3.1               Rtsne_0.15                 
## [299] shiny_1.7.1                 numDeriv_2016.8-1.1        
## [301] polyclip_1.10-0             grid_3.6.3                 
## [303] lazyeval_0.2.2              Formula_1.2-3              
## [305] tsne_0.1-3                  crayon_1.3.4               
## [307] MASS_7.3-54                 pROC_1.16.2                
## [309] viridis_0.5.1               dynparam_1.0.0             
## [311] rpart_4.1-15                zinbwave_1.8.0             
## [313] compiler_3.6.3              ggtext_0.1.0
---
params:
  sample_name: "2021_31"
title: "HS project"
subtitle: "Sample `r params$sample_name`"
author: "Audrey"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output:
  html_document:
    code_folding: show
    code_download: true
    toc: true
    toc_float: true
    number_sections: false
---

<style>
body {
text-align: justify}
</style>

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo=FALSE}
# https://github.com/rstudio/rmarkdown/issues/1453
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot"),
  time_it = local({
    now = NULL
    function(before, options) {
      if (options$time_it) {
        if (before) {
          now <<- Sys.time()
        } else {
          res = difftime(Sys.time(), now, units = "secs")
          paste("(Time to run :", round(res, digits = 2), "s)")
        }
      }
    }
  })
)
```

<!-- Set default parameters for all chunks -->
```{r, setup, include = FALSE}
set.seed(1337L)
knitr::opts_chunk$set(echo = TRUE, # display code
                      # display chunk output
                      message = FALSE,
                      warning = FALSE,
                      fold_output = FALSE, # usefull for sessionInfo()
                      fold_plot = FALSE,
                      
                      # figure settings
                      fig.align = 'center',
                      fig.width = 20,
                      fig.height = 15,
                      
                      # something about seed, chunk and Rmarkdown compilation
                      # https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
                      # cache = TRUE,
                      cache.lazy = FALSE, 
                      
                      # add runtime after chunk
                      time_it = FALSE)
```


The goal of this script is to generate a Seurat object for sample `r params$sample_name`.

* removal of cells based on quality control metrics
* normalization with `LogNormalize`, then doublets detection using `scran hybrid` and `scDblFinder` method, and doublet cells removal
* normalization with `LogNormalize`, for only the remaining cells
* cell cycle and cell type annotation
* dimensionality reduction using `PCA`
* projection using `tSNE` and `UMAP`


```{r library}
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
```

# Preparation

In this section, we set the global settings of the analysis. We will store data there :

```{r out_dir}
out_dir = "."
```

We load the parameters :

```{r get_param}
sample_name = params$sample_name # "2021_31"
# sample_name = "2021_31"
```

Input count matrix is there :

```{r count_matrix_dir}
count_matrix_dir = paste0(out_dir, "/input/", sample_name)
```


We load the markers and specific colors for each cell type :

```{r cell_markers}
cell_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
```

Here are custom colors for each cell type :

```{r color_markers, fig.width = 12, fig.height = 1, class.source = "fold-hide"}
color_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())
```


We load markers to display on the dotplot :

```{r dotplot_markers}
dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
```


We load metadata for this sample :

```{r sample_info}
sample_info = readRDS(paste0(out_dir, "/../1_metadata/hs_hd_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
```


These is a parameter for different functions :

```{r global_settings}
cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50
```

# Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

```{r load_count_matrix, time_it = TRUE}
sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
sobj
```

In genes metadata, we add the Ensembl ID. The `sobj@assays$RNA@meta.features` dataframe contains three information :

* `rownames` : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a `.1` at the end of their name
* `Ensembl_ID` : EnsemblID, as stored in the `features.tsv.gz` file
* `gene_name` : gene_name, as stored in the `features.tsv.gz` file. Duplicated gene names will have the same name.

```{r features_df}
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
```

We add the same columns as in metadata :

```{r add_metadata}
row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]

colnames(sobj@meta.data)
```

# Before filtering

## Normalization

```{r normalization}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We generate a tSNE to visualize cells before filtering.

```{r pca_before, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE with 20 principal components :

```{r tsne_before}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
```

## Cell type

We annotate cells for cell type using `Seurat::AddModuleScore` function.

```{r cell_annot_custom_short, time_it = TRUE}
sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
```

To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short, fig.width = 12, fig.height = 9}
markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```

## Cell cycle phase

We annotate cells for cell cycle phase using `Seurat` and `cyclone`.

```{r cell_cycle, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```

We visualize cell cycle on the projection :

```{r see_cell_cycle1, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```

# Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

```{r qc_metrics}
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
```

We get the cell barcodes for the failing cells :

```{r failed}
fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()
```


## Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

```{r fsobj}
fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
```

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

```{r normalization_doublets}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

We identify doublet cells :

```{r doublet_detection, time_it = TRUE}
fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)

fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)
```

We add the information in the non filtered Seurat object :

```{r add_doublet_metrics}
sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)
```

```{r clean_fsobj, echo = FALSE}
rm(fsobj)
```


## Quality control representation

We can visualize the 4 cells quality with a Venn diagram : 

```{r qc_venn, class.source = "fold-hide", fig.width = 8, fig.height = 6}
n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))
```


### Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

```{r qc_umi_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)
```

```{r vln_umi_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_umi_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Number of features

To visualize the threshold for number of features, we can make a histogram :

```{r qc_features_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)
```


```{r vln_features_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_features_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

```{r qc_mito_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)
```

```{r vln_percentmt_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_mito_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

```{r qc_ribo_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)
```


```{r vln_percentrb_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_ribo_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the `log_nCount_RNA` by `nFeature_RNA` figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

```{r qc_patchwork_cell_type, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

```{r qc_patchwork_mito, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

```{r qc_patchwork_ribo, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`doublets_consensus.class`) :

```{r qc_patchwork_doublet_consensus, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`scDblFinder.class`) :

```{r qc_patchwork_scdblfinder, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`hybrid_score.class`) :

```{r qc_patchwork_hybrid_score, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```


### Visualization as piechart

Do filtered cells belong to a particular cell type ?

```{r qc_piechart_cell_type, class.source = "fold-hide", fig.width = 12, fig.height = 9}
sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

```{r clean_qc_4, echo = FALSE}
sobj$all_cells = NULL

rm(plot_list, df)
```


### Doublet cells status

We can compare doublet detection methods with a Venn diagram : 

```{r qc_venn_doublet, class.source = "fold-hide", fig.width = 8, fig.height = 6}
ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
```

We visualize cells annotation for doublets :

```{r doublets_proj, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)
```

What is the composition of doublet cells ? We just look at score for each cell type.

```{r doublets_composition, class.source = "fold-hide"}
sobj$orig.ident.doublets = case_when(is.na(sobj$doublets_consensus.class) ~ "bad quality",
                                     sobj$doublets_consensus.class == TRUE ~ paste0(sobj$orig.ident, " doublets"),
                                     sobj$doublets_consensus.class == FALSE ~ "not doublet")
sobj$orig.ident.doublets = factor(sobj$orig.ident.doublets,
                                  levels = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                             "bad quality", "not doublet"))

doublets_compo = function(score1, score2) {
  type1 = unlist(lapply(stringr::str_split(score1, pattern = "score_"), `[[`, 2))
  type2 = unlist(lapply(stringr::str_split(score2, pattern = "score_"), `[[`, 2))
  
  if (type1 == type2) {
    the_title = "Homotypic doublet"
    the_subtitle = type1
    score1 = "log_nCount_RNA"
  } else {
    the_title = "Heterotypic doublet"
    the_subtitle = paste(type1, type2, sep = " + ")
  }
  
  p = sobj@meta.data %>%
    dplyr::arrange(desc(orig.ident.doublets)) %>%
    ggplot2::ggplot(., aes(x = eval(parse(text = score1)),
                           y = eval(parse(text = score2)),
                           col = orig.ident.doublets)) +
    ggplot2::geom_point(size = 0.25) +
    ggplot2::scale_color_manual(values = c(sample_info$color, "gray90", "gray60"),
                                breaks = c(paste0(as.character(sample_info$sample_identifiant), " doublets"),
                                           "bad quality", "not doublet")) +
    ggplot2::labs(x = score1, y = score2,
                  title = the_title, subtitle = the_subtitle) +
    ggplot2::theme_classic() +
    ggplot2::theme(aspect.ratio = 1,
                   plot.title = element_text(hjust = 0.5),
                   plot.subtitle = element_text(hjust = 0.5))
  
  return(p)
}
```

```{r doublets_patchwork_prep, class.source = "fold-hide"}
score_columns = grep(x = colnames(sobj@meta.data),
                     pattern = "^score",
                     value = TRUE)
combinations = expand.grid(score_columns, score_columns) %>%
  apply(., 1, sort) %>% t() %>%
  as.data.frame()
combinations = combinations[!duplicated(combinations), ]

plot_list = apply(combinations, 1, FUN = function(elem) {
  doublets_compo(elem[1], elem[2])
})

sobj$orig.ident.doublets = NULL
```


```{r doublets_patchwork, fig.width = 12, fig.height = 80, fold_plot = TRUE}
patchwork::wrap_plots(plot_list, ncol = 4) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

## Save

We could save this object before filtering (remove `eval = FALSE`) :

```{r save_sobj_unfiltered_annotated, eval = FALSE}
saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))
```


# Filtering

We remove :

* cells with a number of UMI lower than `r cut_log_nCount_RNA`
* cells expressing a number of genes lower than `r cut_nFeature_RNA`
* cells having more than `r cut_percent.mt` \% of UMI related to mitochondrial genes
* cells having more than `r cut_percent.rb` \% of UMI related to ribosomal genes
* doublet cells detected with both method (**scDblFinder** and **scds-hybrid**)


```{r filter_cells}
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
```

```{r clean_filter, echo = FALSE}
rm(fail_doublets_consensus, fail_doublets_scDblFinder, fail_doublets_hybrid,
   fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA,
   cut_percent.mt, cut_percent.rb, cut_log_nCount_RNA, cut_nFeature_RNA)
```


# Post-filtering processing

## Normalization

We normalize the count matrix for remaining cells :

```{r normalization_3}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We perform a PCA :

```{r pca, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE and a UMAP with 20 principal components :

```{r tsne_umap}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))
```


## Annotation

We annotate cells for cell type, with the new normalized expression matrix :

```{r cell_type_2, time_it = TRUE}
score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
```


To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short2, fig.width = 12, fig.height = 9}
markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype2, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```


### Cell cycle

We annotate cells for cell cycle phase :

```{r cell_cycle2, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```


We visualize cell cycle on the projection :

```{r see_cell_cycle2, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```


## Clustering

We make a highly resolutive clustering :

```{r clustering}
sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)

table(sobj$seurat_clusters)
```


# Visualization

## Cell type

We can visualize the cell type :

```{r see_cell_type, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```

## Cell cycle

We can visualize the cell cycle, from Seurat :

```{r see_cc_Seurat, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


We can visualize the cell cycle, from cyclone :

```{r see_cc_cyclone, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Clusters

We visualize the clustering :

```{r see_clusters, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Gene expression

We visualize all cell types markers on the tSNE :

```{r plot_genes, fig.width = 12, fig.height = 20}
markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)
```


# Save

We save the annotated and filtered Seurat object :

```{r save_sobj_filtered_annotated}
saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))
```


# R session

```{r sessioninfo, echo = FALSE, fold_output = TRUE}
sessionInfo()
```
